/* Reads files from csv and creates person-level and firm-level data sets */

#delimit ;
clear all;

local outfile "GWgap_readin_v3b";
set more off;


di _n "$S_DATE $S_TIME";







*1*******************************************************************************;
* HLFS variables at the quarterly frequency;
********************************************************************************;


*1b0*******************************************************************************;
* mapping between occupation and industry codes to use below;

import delimited using "Wgap_HLFS_IDI_v3_recent.csv", varn(1) case(preserve) 
	clear stringc(1 2);

drop if snz_uid=="";
compress hlfs_urd_quarter_date;

save tempin_top, replace;




* making NZSCO 3-digit TO ANZSCO 2-digit mapping for converting occupation;

use hlfs_urd_occ_main_code hlfs_urd_nzsco90_code using tempin_top if hlfs_urd_occ_main_code!="NULL" 
	& hlfs_urd_nzsco90_code!="NULL";
	
rename hlfs_urd_* *;

destring occ_main_code nzsco90_code, force replace;

replace occ_main_code = floor( occ_main_code /10);

gen ct = 1;

collapse (count) ct, by( nzsco90_code occ_main_code);

bys nzsco90_code: egen ct_old = total(ct);

gen fract = ct/ct_old;

bys nzsco (fract): gen biggest = _n==_N;

keep if biggest==1;

keep nzsco90_code occ_main_code;

rename nzsco90_code nzsco90_3d;
rename occ_main_code anzsco06_2d;
save temp_occ_map, replace;




* making NZSCO 3-digit TO ANZSCO 3-digit mapping for converting occupation;

use hlfs_urd_occ_main_code hlfs_urd_nzsco90_code using tempin_top if hlfs_urd_occ_main_code!="NULL" 
	& hlfs_urd_nzsco90_code!="NULL";
	
rename hlfs_urd_* *;

destring occ_main_code nzsco90_code, force replace;

gen ct = 1;

collapse (count) ct, by( nzsco90_code occ_main_code);

bys nzsco90_code: egen ct_old = total(ct);

gen fract = ct/ct_old;

bys nzsco (fract): gen biggest = _n==_N;

sum fract if biggest==1;
keep if biggest==1;

keep nzsco90_code occ_main_code;

rename nzsco90_code nzsco90_3d;
rename occ_main_code anzsco06_3d;
save temp_occ_map3d, replace;





* making ANZSIC 1996 level 3 TO ANZSIC 2006 level 2 mapping for converting industry;

use hlfs_urd_ind_main_code hlfs_urd_anzsic96_code using tempin_top if hlfs_urd_ind_main_code!="NULL" 
	& hlfs_urd_anzsic96_code!="NULL";

rename hlfs_urd_* *;

foreach var in ind_main_code anzsic96_code {;
	replace `var' = "" if `var'=="NULL";
};

replace ind_main_code = trim(ind_main_code);
compress;

gen anzsic06 = substr(ind_main_code,1,3);
drop ind_main_code;

rename anzsic96_code anzsic96;

gen ct = 1;

collapse (count) ct, by(anzsic06 anzsic96);

bys anzsic96: egen ct_old = total(ct);

gen fract = ct/ct_old;

bys anzsic96 (fract): gen biggest = _n==_N;
sum fract if biggest==1;

keep if biggest==1;

keep anzsic96 anzsic06;

rename anzsic96 anzsic96_l3;
rename anzsic06 anzsic06_l2;
save temp_ind_map, replace;








*1b0.5*******************************************************************************;
* ethnicity data for HLFS people;

import delimited using "GWgap_HLFS_eth.csv", varn(1) case(preserve) 
	clear stringc(1);

drop if eth1==.;
compress;

rename eth1 eth_eur;
label var eth_eur "European ethnicity";
rename eth2 eth_mao;
label var eth_mao "Maori ethnicity";
rename eth3 eth_pac;
label var eth_pac "Pacific ethnicity";
rename eth4 eth_asi;
label var eth_asi "Asian ethnicity"; 
rename eth5 eth_mel;
label var eth_mel "Middle Eastern/Latin American/African ethnicity";
rename eth6 eth_oth;
label var eth_oth "Other ethnicity";

egen eth_gp = group(eth*);
bys eth_gp: gen gp_size = _N;

replace eth_gp = 99 if gp_size<1000;
label var eth_gp "Ethnicity combination";
notes eth_gp: All combinations with fewer than 1000 individuals are combined into other. 
	Missing is a category.;
drop gp_size;
compress;
notes: GWgap_HLFS_eth.dta was created on $S_DATE at $S_TIME by `outfile'.do. An observation
	is an individual who appears in the HLFS linked to the IDI, and the data set contains ethnicity
	information from the IDI prioritised source.;
save GWgap_HLFS_eth, replace;





*1b1*******************************************************************************;
* main HLFS data linked to IDI at the person-quarter level;

import delimited using "Wgap_HLFS_IDI_v3_recent.csv", varn(1) case(preserve) 
	clear stringc(1 2);

drop if snz_uid=="";
compress hlfs_urd_quarter_date;

capture drop v73 v74;

rename hlfs_urd_* *;

rename final_wgt_nbr HLFS_wgt;
label var HLFS_wgt "final_weight_nbr from HLFS";


*** date variables;

gen year = substr(quarter_date,1,4);
destring year, replace;
label var year "Year of survey";

gen quarter = substr(quarter_date,6,2);
destring quarter, replace force;
replace quarter = round(quarter/3);
label var quarter "Quarter of survey";

gen fin_yr = year if quarter==1;
replace fin_yr = year + 1 if quarter>=2;
label var fin_yr "Financial year (ending March)";

drop quarter_date;

order snz_uid year quarter;


*** usual hours worked;

rename usual_hr_main_nbr hrs_main;
label var hrs_main "Hours usually worked in 1st job";
capture replace hrs_main = "" if hrs_main=="NULL";
destring hrs_main, replace;
replace hrs_main = 99 if hrs_main>99 & hrs_main<.;
notes hrs_main: These are capped at 99.;

foreach var in usual_hr_sec_nbr usual_hr_oth_nbr {;
	destring `var', force replace;
};
replace ps_qual_sold_code = "" if ps_qual_sold_code=="NULL";

rename usual_hr_sec_nbr hrs_oth;
replace hrs_oth = hrs_oth + usual_hr_oth_nbr if usual_hr_oth_nbr<.;
label var hrs_oth "Hours usually worked in other jobs";
replace hrs_oth = 99 if hrs_oth>99 & hrs_oth<.;
notes hrs_oth: These are capped at 99.;

gen hrs_tot = hrs_main;
replace hrs_tot = hrs_tot + hrs_oth if hrs_oth<.;
replace hrs_tot = 99 if hrs_tot>99 & hrs_tot<.;
label var hrs_tot "Hours usually worked in all jobs";
notes hrs_tot: These are capped at 99.;

drop usual_hr_oth_nbr;


*** highest qual;

foreach var in has_sch_qual_code highest_qual_code {;
	capture replace `var' = "" if `var'=="NULL";
	destring `var', replace;
};

gen hqual =.;
label define hqual 0 "No qualifications" 1 "School qualifications" 2 "Post-school qualfications"
	3 "Degree or higher";
label val hqual hqual;
label var hqual "Highest qualification";

* highest qual in post 2013 data from highest_qual_code;

replace hqual = 0 if highest_qual_code==22; * no quals;
replace hqual = 1 if highest_qual_code<=21 & highest_qual_code>=13 & highest_qual_code!=15
	& highest_qual_code!=17 & highest_qual_code!=19; * school incl level 3 and below post school;
replace hqual = 2 if (highest_qual_code<=12 & highest_qual_code>=5) | highest_qual_code==15
	| highest_qual_code==17 | highest_qual_code==19; * post school;
replace hqual = 3 if highest_qual_code<=4;  * degree;

* highest postschool qual until q1 2013 from ps_qual_sold_code;

gen ps_mid = ps_qual_sold_code if 
	year<2013 | (year==2013 & quarter==1);
replace ps_mid = subinstr(ps_mid,"10", "1",.); * call "other" post school quals post school;

forvalues i = 1/7 {;
	replace ps_mid = subinstr(ps_mid,"`i'", "1",.); * post schol lower than degree;
};
replace ps_mid = subinstr(ps_mid,"9", "8",.); * bachelor's degree and higher;

gen hqualb = .;
replace hqualb = 3 if strpos(ps_mid,"8")>0;
replace hqualb = 2 if hqual==. & strpos(ps_mid,"1")>0;

* adding in whether school quals to q1 2013 if there are no postschool quls;

replace hqualb = 1 if has_sch_qual_code==1 & hqualb==. & year<2013 | (year==2013 & quarter==1);
replace hqualb = 0 if has_sch_qual_code==2 & hqualb==. & year<2013 | (year==2013 & quarter==1);

* putting pre and post 2013 quals into same var;

replace hqual = hqualb if hqual==.;
drop hqualb;

drop has_sch_qual_code highest_qual_code ps_qual_sold_code;

* making pre and post 2013 quals variables because coding is not consistent;

gen hqual_to12 = hqual if year<=2012 | (year==2013 & quarter==1);
gen hqual_fr13 = hqual if year>=2014 | (year==2013 & quarter>=2 & quarter<=4);
replace hqual_to12 = 0 if year>=2014 | (year==2013 & quarter>=2 & quarter<=4);
replace hqual_fr13 = 0 if year<=2012 | (year==2013 & quarter==1);
drop hqual;

label var hqual_to12 "Highest qualification (up to 2013 q1)";
label var hqual_fr13 "Highest qualification (from 2013 q2)";
notes hqual_to12: Coded as 0 (no qual) from 2013 q2 onwards;
notes hqual_fr13: Coded as 0 (no quals) until 2013 q1;


*** gender;

destring sex_code, force replace;
gen female = sex_code==2;
label var female "Female";
drop sex_code;


*** age, date and place of birth;

destring age_nbr, force replace;
rename age_nbr age;
label var age "Age";

replace country_of_birth_code = trim(country_of_birth_code);
gen nz_born = country_of_birth_code=="1201" if country_of_birth_code!="NULL" 
	& country_of_birth_code!="9999" & country_of_birth_code!="0000" & (year>=2008 | (year==2007 & quarter==4));
replace nz_born = country_of_birth=="1" if country_of_birth_code!="NULL" 
	& (year<2007 | (year==2007 & quarter<4));
label var nz_born "Born in New Zealand";
notes nz_born: High proportion missing from 2016 onwards;
rename country_of_birth_code cob;
label var cob "Country of birth";
notes cob: Coding changes in Q4 of 2007;
destring cob, force replace;


*** household type;

destring household_type_code, force replace;
rename household_type_code hhcomp;
recode hhcomp (15 16 = 1) (1 2 = 2) (10/14 = 3) (3/9 = 4) (17 . = 9);

label define hhcomp 1 "Single, no children" 2 "Couple, no children" 3 "Single, with children"
	4 "Couple with children" 9 "Unidentifiable";
label val hhcomp hhcomp;
label var hhcomp "Household composition";
notes hhcomp: Children include adult children. Non-partner, non-children individuals may also be
	included in any of these households. Note the individual may be the parent or the child in the household.;
	
*** occupation 2009 onwards;
	
rename occ_main_code occ3d;
replace occ3d = "" if occ3d=="NULL";
label var occ3d "Occupation in main job (3-digit ANZSCO 2006)";
destring occ3d, force replace;
replace occ3d = floor(occ3d/1000) if occ3d>999; * coding is 6-digit for 2016-18, but only 3d elsewhere;

gen occ2d = floor(occ3d/10);
label var occ2d "Occupation in main job (2-digit ANZSCO 2006)";


*** occupation pre 2009;

replace nzsco90_code = "" if nzsco90_code=="NULL";
destring nzsco90_code, force replace;
rename nzsco90_code occ3d_old;
replace occ3d_old = 0 if year>=2009 & occ3d<.;
label var occ3d_old "Occupation in main job pre 2009 (3-digit NZSCO 1990)";
notes occ3d_old: Available for years 2006-08;

gen occ2d_old = floor(occ3d_old/10);
label var occ2d_old "Occupation in main job pre 2009 (2-digit NZSCO 1990)";


*** merging in 2-digit and 3-digit consistent codes;

gen nzsco90_3d = occ3d_old;
merge m:1 nzsco90_3d using temp_occ_map, keep(master match) nogen;
merge m:1 nzsco90_3d using temp_occ_map3d, keep(master match) nogen;

replace occ2d = anzsco06_2d if occ2d==.;
notes occ2d: Available for full period, though pre-2009 are from imperfect mapping;

replace occ3d = anzsco06_3d if occ3d==.;
notes occ3d: Available for full period, though pre-2009 are from imperfect mapping;
drop anzsco06_2d anzsco06_3d nzsco90_3d;


*** industry from 2009;

rename ind_main_code ind4;
label var ind4 "Industry (ANZSIC level 4)";
replace ind4 = "" if ind4=="NULL";

gen ind3 = substr(ind4,1,4);
label var ind3 "Industry 2009 onwards (ANZSIC 2006 level 3)";

drop ind4; * this only has more info than ind3 for 2016-18;

notes ind3: Available for years 2009-2018 only;


*** industry pre 2009;

#delimit ;
replace anzsic96_code = "" if anzsic96_code=="NULL";
gen ind3_old = substr(anzsic96_code,1,4);
label var ind3_old "Industry before 2009 (ANZSIC 1996 level 3)";
replace ind3_old = "" if year>=2009;


replace ind3 = "Z" if ind3_old!="";
replace ind3_old = "Z" if ind3!="" & ind3!="Z";


*** merging in level 2 consistent industry codes;

gen anzsic96_l3 = anzsic96_code if year<2009;

merge m:1 anzsic96_l3 using temp_ind_map, keep(master match) nogen;

gen ind2 = substr(ind3,1,3) if year>=2009;
replace ind2 = anzsic06_l2 if year<2009;
label var ind2 "Industry (ANZSIC 2006 level 2)";
notes ind2: Available all years, but pre 2009 was generated from imperfect mapping;
drop anzsic06_l2 anzsic96_l3;


*** multiple jobs - labelled as multi jobs if non-missing actual or usual hours in second job;

destring actual_hr_sec_nbr, force replace;
gen multi_job = (hrs_oth<. & hrs_oth>0) | (actual_hr_sec_nbr<. & actual_hr_sec_nbr>0);
label var multi_job "Works in multiple jobs";
notes multi_job: As indicated by non-zero, non-missing usual or actual hours worked in other jobs; 


*** collective employment agreement;

#delimit ;
gen collec = 1 if emp_agree_type_code=="02";
replace collec = 0 if emp_agree_type_code=="01";
label var collec "Collective employment agreement (2016 onwards only)";
drop emp_agree_type_code;


*** full time, labour force status;

gen ft = 1 if ft_pt_code=="1";
replace ft = 0 if ft_pt_code=="2";
label var ft "Works full time";
notes ft: missing if does not work;
drop ft_pt_code;

gen lfs = 1 if lfs_code=="1";
replace lfs = 2 if lfs_code=="2";
replace lfs = 3 if lfs_code=="3";
label define lfs 1 "Employed" 2 "Unemployed" 3 "Not in labour force";
label val lfs lfs;
label var lfs "Labour force status";
tostring lfs, replace;
drop lfs_code;



*** identifying parents of children;

capture destring fam_parent_role_ind, force replace;
replace fam_parent_role_ind = 0 if fam_parent_role_ind!=1;
capture destring fam_num_dep_nbr, force replace;
replace fam_num_dep_nbr = 0 if fam_num_dep_nbr>=99;
gen par_u18_ct = fam_parent_role_ind*fam_num_dep_nbr;
drop fam_parent_role_ind fam_num_dep_nbr;



*** other;

rename regional_council_code rc;
label var rc "Regional council";


notes snz_uid: From [IDI_Clean_20181020];


capture drop nucleus_nbr;
drop ps_mid;


compress;
#delimit ;
order snz_uid year quarter female age;
notes: Wgap_HLFS_IDI_quart_v3_recent.dta was created from Wgap_HLFS_IDI_v3_recent.csv by `outfile'.do.
An observation is an HLFS individual in a survey quarter.;
save Wgap_HLFS_IDI_quart_v3_recent, replace;








*2*******************************************************************************;
* individual-year level data excluding firm characteristics;
********************************************************************************;
	

*2b*******************************************************************************;
* main file of worker information;

forvalues i = 1/5 {; 

	local min = 10000000*(`i'-1)+1;
	local max = 10000000*`i';

	import delimited using "Wgap_v3_emp_yr4.csv", varn(1) case(preserve) 
		clear rowr(`min':`max') stringc(1);
	capture drop if snz_ird_uid==.;
	capture drop if snz_ird_uid=="";
		
	destring tot_fte_WP_av min_age, force replace;

	drop if WP==1;
	notes: Excludes individuals who are ever WPs at any pent.;

	replace hp_pent = trim(hp_pent);
	compress;
	
	label var financial_year "Financial year";
	notes financial_year: Year ending in March of stated calendar year.;
	label var tot_gross_earn_yr "Individual's total wage/salary earnings for year";
	label var max_gross_earn_yr "Individual's wage/salary earnings from highest-paying pent in year";
	label var WP "Ever a working proprietor at any pent";
	label var tot_fte_employee_av "Average monthly non-WP employee FTEs of employee";
	notes tot_fte_employee_av: Values over 1 are missing.;
	label var max_fte_employee_av "Average monthly non-WP employee FTEs of employee at hp pent";
	notes max_fte_employee_av: The average is over 12 months regardless of months worked there.;
	label var tot_fte_WP_av "Average monthly pseudo-FTEs as a WP";
	label var tot_mon_wkd "Total number of calendar months worked";
	label var max_mon_wkd "Number of calendar months worked at highest paying pent";
	label var max_mon_wkd_ft "Number of calendar months worked full time at highest paying pent";
	notes max_mon_wkd_ft: Full time is defined as having FTE of 1.;
	label var max_mon_wkd_pt "Number of calendar months worked part time at highest paying pent";
	notes max_mon_wkd_pt: Part time is defined as having FTE of less than 1.;
	label var num_pents "Number of pents worked at during year";
	label var min_age "Age at start of year";
	label var female "Female";
	label var hp_pent "Highest paying pent in year";
	label var wkd_hpp_1ya "Worked for highest paying pent in previous year";
	label var wkd_hpp_2ya "Worked for highest paying pent in year before last";
	
	compress;
	save Wgap_v3_emp_yr4_`i', replace;
};







*3*******************************************************************************;
* pent-March-year level data for merging with individual level data;
********************************************************************************;

*3a*******************************************************************************;
* firm size data;

import delimited using "Wgap_v3_pent_yrA1.csv", varn(1) case(preserve) clear;

keep pent year hcy_wagedWP_t hcy_unwgdWP_t hc_t hc_m hc_f hc_t_ft hc_t_pt  
	fte_t fte_m fte_f;

drop if year==.;
compress pent;

save tempA1, replace;

import delimited using "Wgap_v3_pent_yrA2.csv", varn(1) case(preserve) clear;

keep pent year hcy_wagedWP_t hcy_unwgdWP_t hc_t hc_m hc_f hc_t_ft hc_t_pt  
	fte_t fte_m fte_f;

drop if year==.;
compress pent;

append using tempA1;


drop if year>9999; * dropping data corresponding to non-March years;
drop if year>2018 | year<=2000; * Not in sample;

gen L_hc = hc_t/12;
label var L_hc "Average non-WP employee head count at pent"; 

gen L_hc_ft = hc_t_ft/12*(hc_t/(hc_t_ft + hc_t_pt));
label var L_hc_ft "Average ft non-WP employee head count at pent";
gen L_hc_pt = hc_t_pt/12*(hc_t/(hc_t_ft + hc_t_pt));
label var L_hc_pt "Average pt non-WP employee head count at pent";


foreach g in m f {;
	gen av_fte_`g' = 12*fte_`g'/hc_`g';	
	replace av_fte_`g' = 1 if hc_`g'==0;
	label var av_fte_`g' "Average monthly FTEs of `g' in the pent";
};

gen s_f = hc_f/(hc_f + hc_m);
label var s_f "Female share of pent head count";

gen s_f_fte = fte_f/(fte_f + fte_m);
label var s_f_fte "Female share of pent FTEs";

drop hc_m hc_f fte_m fte_f;

label var year "March year";

gen L_WP = hcy_wagedWP_t + hcy_unwgdWP_t;
label var L_WP "Working proprietor head count";
notes L_WP: Includes waged and non-waged WPs. A waged WP is a person who is ever
	a WP at the pent and at some point in the year earned wage income from the pent.
	A non-waged WP is a person who is WP of the pent in the year and did not earn
	wage income from the pent in the year, and did earn non-wage income from the
	pent in the year.;
drop hcy_wagedWP_t hcy_unwgdWP_t;

rename fte_t L_fte;
label var L_fte "Non-WP employee FTEs";

drop hc_t hc_t_ft hc_t_pt;

tab year;
compress;
save temp_pent_indiv_v3, replace;




*3b*******************************************************************************;
* Firm multiplant indicator;

import delimited using "Wgap_v3_loc.csv", varn(1) case(preserve) 
	clear; 
	
drop if year==.;
compress pent;

drop if year>9999; * dropping data corresponding to non-March years;
drop if year>2018 | year<=2000; * Not in sample - need to check when sample ends;

label var multiplant "Firm had multiple plants in 2+ months";
label var year "March year";

merge 1:1 pent year using temp_pent_indiv_v3, keep(master match using);

#delimit ;
tab year _m;
capture drop _m;
save temp_pent_indiv2_v3, replace;





*3c*******************************************************************************;
* pent industry;

 
import delimited using "Wgap_v3_indy.csv", varn(1) case(preserve) 
	clear; 

drop if _n==_N;
compress;

label var anz06_4d "Industry";

replace anz06_4d = "" if anz06_4d=="NULL";

compress;
save temp_indy_v3, replace;




*3d*******************************************************************************;
* business type and institutional sector;


use temp_indy_v3, clear;

save temp_indy_BTIS_v3, replace; * used for firm-level data set;



use temp_indy_BTIS_v3, clear;
merge 1:m pent using temp_pent_indiv2_v3, keep(master match using);
drop if _m==1;
drop _m;

compress;
save temp_pent_indiv3_v3, replace;








*3e*******************************************************************************;
* list of firms with prody data to link with individual data;

import delimited using "Wgap_v3b_prody.csv", varn(1) case(preserve) 
	clear;

keep if dim_year_key>=200103 & dim_year_key<=201803;

gen year = floor(dim_year_key/100);

* keep firms with non-missing gross output;
keep if go_nom<.;

keep pent year;

gen in_prody_dat = 1;
label var in_prody_dat "pent-year is in productivity data";

compress;
save temp_in_prod_v3, replace;






*4*******************************************************************************;
* merging firm March year data onto individual data;
********************************************************************************;

forvalues i = 1/1 {;
*forvalues i = 1/5 {; * need to run this over i = 1/5, but fake data don't exist;

	use Wgap_v3_emp_yr4_`i', clear;
	
	
	* dropping obs that we know are someone being employed, but we don't know who;
	
	drop if snz_ird_uid==0;
	
	* dropping years very not in sample;
	
	rename financial_year year;
	drop if year>2018 | year<=2000; 
	
	
	* getting rid of missing values;
	
	replace tot_fte_employee_av  = . if tot_fte_employee_av>1;
	replace max_fte_employee_av = . if max_fte_employee_av>1;
	
	* merging in firm data;
	
	rename hp_pent pent;
	di _n "Merging on firm data";
	merge m:1 pent year using temp_pent_indiv3_v3, keep(master match) nogen;
	codebook L_hc;

	merge m:1 pent year using temp_in_prod_v3, keep(master match) nogen;
	replace in_prody_dat = 0 if in_prody_dat==.;


	rename pent hp_pent;
	compress;
	
	sort snz_uid year;
	set seed 8719;
	gen rand = runiform();
	label var rand "stable random number for observation";
	
	label var year "March year";
	
	compress;
	notes: Wgap_v3_ind`i'.dta was created by `outfile'.do. An observation is a person
		who worked in a March year, and who was never a WP at any firm. Years outside
		2001 to 2018 are dropped, but these years are not fullly complete. Contains individual 
		information and information on the pent
		that paid the person most wages in the year. This is file `i' of 5, split
		for size reasons.;
	drop tot_fte_WP_av;
	
	compress;
	
	* tenure vars missing in first year;
	
	foreach var in wkd_hpp_1ya wkd_hpp_2ya {;
		replace `var' = . if year<=2001;
	};
	
	save Wgap_v3_ind`i', replace;
	
};


/* This creates fake data to test the code */

use Wgap_v3_ind1, clear;
forvalues i = 2/5 {;
	save Wgap_v3_ind`i', replace;
};






*5*******************************************************************************;
* additional pent-level data;
********************************************************************************;

*5a******************************************************************************;
* Firm multiplant indicator for financial years;

import delimited using "Wgap_v3_loc.csv", varn(1) case(preserve) 
	clear; 
	
drop if year==.;
compress pent;

* getting rid of March year observations for firms with non-March year financial years;	

gen yr2 = year if year<9999;
replace yr2 = floor(year/100) if yr2==.;
bys pent yr2: drop if _N==2 & year<9999;
drop year;
rename yr2 year;
label var year "Pent's financial year";

label var multiplant "Firm had multiple plants in 2+ months";
label var year "March year";

drop if year<2001 | year>2018;

compress;
save temp_RC_v3, replace;






*5b******************************************************************************;
* Firm productivity data;

#delimit ;
import delimited using "Wgap_v3b_prody.csv", varn(1) case(preserve) clear; 
	
gen year = floor(dim_year_key/100);
drop dim_year_key;	
drop if year==.;
compress pent;
label var year "financial year";
drop if year<2001 | year>2016;

label var pf_ind "Industry";
notes pf_ind: These are the industry codes from the productivity data, based on the
	Annual Enterprise Survey forms and BOS industry categorisations.;
	
label var anz06_4d "Level 4 industry";


gen L_Rich = exp(lnL);
label var L_Rich "Richard's head count";
drop lnL;

* shifting RLR_num from K to M;

#delimit ;
replace K_nom = K_nom - RLR_nom;
replace M_nom = M_nom + RLR_nom;


* merging in CPI to use as deflators for everything;

#delimit ;
merge m:1 year using CPI, keep(master match) keepus(deflator) nogen;


* generating real and log real values;

foreach var in K M go {;
	gen `var'_real = `var'_nom*deflator;
	gen ln`var' = ln(`var'_real);
	label var ln`var' "`var' (ln real 2006 $ using CPI)";
};

* making an indicator for any of M, K, go change more than 400% from previous year;

sort pent year;

foreach var in M K go {;

	gen o400_ch_`var' = 0;
	replace o400_ch_`var' = 1 if pent[_n-1]==pent & year[_n-1] + 1==year 
		& (abs(`var'_real - `var'_real[_n-1])/`var'_real[_n-1] > 4) & `var'_real<. & `var'_real[_n-1]<.;
	label var o400_ch_`var' "`var'_real changed more than 400% from prev yr";
};

gen o400_ch = max(o400_ch_M,o400_ch_K,o400_ch_go);
label var o400_ch "M_real, K_real, or go_real changed more than 400% from prev yr";

* making an indicator for any of M, K, go is above the 99.9th percentile for firms with 5+ head count according to richard;

foreach var in M K go {;

	gen o99p_`var' = 0;
	sum `var'_real if L_Rich>=5 & L_Rich<., d;
	replace o99p_`var' = 1 if `var'_real<. & `var'_real>r(p99);
	label var o99p_`var' "`var'_real is over 99th percentile for firms with hc>=5";
	
	gen o999p_`var' = 0;
	sum `var'_real if L_Rich>=5 & L_Rich<. & o99p_`var' == 1, d;
	replace o999p_`var' = 1 if `var'_real<. & `var'_real>r(p90);
	label var o999p_`var' "`var'_real is over 99.9th percentile for firms with hc>=5";
	if "`var'"!="K" drop o99p_`var';
};


gen o999p = max(o999p_M,o999p_K,o999p_go);
label var o999p "M_real, K_real, or go_real is over 99.9th percentile";

keep pent year go_nom M_nom K_nom K_real lnK M_real lnM go_real lngo o400_ch o999p anz06_4d pf_ind o400_ch_K o99p_K;
compress;

notes: GWgap_readin_v3b_prody.dta was created by `outfile'.do on $S_DATE at $S_TIME.;
save GWgap_readin_v3b_prody, replace;









*6*******************************************************************************;
* pent-level data on employee shares;
********************************************************************************;

*6a***************************************;
* reading in and saving the various files;

foreach file in A1 _age1 _ten {; 
	import delimited using "Wgap_v3_pent_yr`file'.csv", varn(1) case(preserve) 
		clear;
	if "`file'"=="A1" {;
		save temp, replace;
		import delimited using "Wgap_v3_pent_yrA2.csv", varn(1) case(preserve) 
			clear;
		append using temp;
	};
	if "`file'"=="_age1" {;
		save temp_age, replace;
		import delimited using "Wgap_v3_pent_yr_age2.csv", varn(1) case(preserve) 
			clear;
		append using temp_age;
	};
	
		
	drop if year==.;
	compress pent;
		
	* drop pents not well identified;

	drop if substr(pent,1,2)=="ZZ";


	* dropping years without data;

	drop if year==199903 | year==200003 | year>=202003 | year<=2000 | year==2019  
		| year==2020;
		
	* getting rid of March year observations for firms with non-March year financial years;	

	gen yr2 = year if year<9999;
	replace yr2 = floor(year/100) if yr2==.;
	bys pent yr2: drop if _N==2 & year<9999;
	drop year;
	rename yr2 year;
	label var year "Pent's financial year";
	order pent year;
	compress;
	
	if "`file'"=="A1" local file A;
	if "`file'"=="_age1" local file _age;
	
	save temp_pent_yr`file'_v3, replace;
	
};




*6b***************************************;
* looping over firms with head count>=5 and those with head count <5;

foreach headct in "" _hclt5 {;

	if "`headct'"=="" {; *head count 5+;
		local headctc "hc_t>=60";
		local headctl "5+";
	};
	if "`headct'"=="_hclt5" {; *head count <5;
		local headctc "hc_t<60";
		local headctl "<5";
	};

	*6b***************************************;
	* Combining data on employee shares by pent, creating variables to use;

	use if `headctc' using temp_pent_yrA_v3, clear;

	notes: Includes firms with average head count of `headctl';

	label var hc_t "12 x avg monthly head count";
	label var hc_f "12 x avg monthly female head count";
	label var hc_m "12 x avg monthly male head count";


	* scaling up male and female head counts so they sum to total if missing genders are <10% of hc;
	* and setting gender-specific head counts missing if 10%+ of head count has missing gender;

	gen hc_s = hc_m + hc_f;

	foreach g in m f {;
		gen L_hc_`g' = hc_`g'*hc_t/(12*hc_s) if (hc_t - hc_s)/hc_t<=0.1;
	};
	label var L_hc_m "Male employee head count (avg)";
	label var L_hc_f "Female employee head count (avg)";
	gen L_hc_t = hc_t/12;
	drop hc_s;


	* scaling up male and female FTEs so they sum to total if discrepancy is <0.1;
	* and setting both missing if the discrepancy is larger;

	gen fte_s = fte_m + fte_f;

	foreach g in m f {;
		gen L_fte_`g' = fte_`g'*fte_t/fte_s if (fte_t - fte_s)/fte_t<=0.1;
	};
	label var L_fte_f "Total female FTEs";
	label var L_fte_m "Total male FTEs";
	gen L_fte_t = fte_t;
	drop fte_s;


	* generating average male and average female FTEs, setting to 1 if no such workers;

	gen av_fte_m = L_fte_m/L_hc_m;
	replace av_fte_m = 1 if L_hc_m==0;
	gen av_fte_f = L_fte_f/L_hc_f;
	replace av_fte_f = 1 if L_hc_f==0;
	label var av_fte_m "Average male FTEs";
	label var av_fte_f "Average female FTEs";


	* generating male and female part time and full time labour inputs;

	gen hc_time_s = hc_m_ft + hc_f_ft + hc_m_pt + hc_f_pt;
	foreach ty in m_ft f_ft m_pt f_pt {;
		gen L_hc_`ty' = hc_`ty'*hc_t/(12*hc_time_s) if (hc_t - hc_time_s)/hc_t<=0.1;
	};
	drop hc_time_s;
	label var L_hc_m_ft "Full time male employee hc";
	label var L_hc_f_ft "Full time female employee hc";
	label var L_hc_m_pt "Part time male employee hc";
	label var L_hc_f_pt "Part time female employee hc";



	**********************************;
	* merging in age share variables, part time and full time;

	merge 1:1 pent year using temp_pent_yr_age_v3, keepus(hc_t_* fte_t_*) keep(master match) nogen;


	* head counts and ftes of labour in each age category: scaling if <10% missing, else dropping;

	gen sum_hc_age = 0;
	gen sum_fte_age = 0;
	gen hc_t_55p = hc_t_55to64 + hc_t_65p;
	gen fte_t_55p = fte_t_55to64 + fte_t_65p;
	drop hc_t_55to64 hc_t_65p fte_t_55to64 fte_t_65p;

	foreach cat in lt25 25to39 40to54 55p {;
		replace sum_hc_age = sum_hc_age + hc_t_`cat';
		replace sum_fte_age = sum_fte_age + fte_t_`cat';
	};
	foreach cat in lt25 25to39 40to54 55p {;
		gen L_hc_`cat' = hc_t_`cat'*(hc_t/sum_hc_age)/12 if (hc_t - sum_hc_age)/hc_t<=0.1;
		label var L_hc_`cat' "Employee head count aged `cat'";
		gen av_fte_`cat' = fte_t_`cat'*(fte_t/sum_fte_age)/L_hc_`cat' if (fte_t - sum_fte_age)/fte_t<0.1;
		replace av_fte_`cat' = 1 if L_hc_`cat'==0;
		label var av_fte_`cat' "Average FTEs of workers aged `cat'";
		drop hc_t_`cat' fte_t_`cat';
	};
	drop sum_hc_age sum_fte_age;


	* head cts & ftes of labour in each age category X part/full time: scaling if <10% missing, else dropping;

	gen sum_hc_timeage = 0;
	gen hc_t_55p_ft = hc_t_55to64_ft + hc_t_65p_ft;
	gen hc_t_55p_pt = hc_t_55to64_pt + hc_t_65p_pt;
	drop hc_t_55to64_ft  hc_t_65p_ft hc_t_55to64_pt hc_t_65p_pt;

	foreach cat in lt25 25to39 40to54 55p {;
		replace sum_hc_timeage = sum_hc_timeage + hc_t_`cat'_ft + hc_t_`cat'_pt;
	}; 
	foreach cat in lt25 25to39 40to54 55p {;
		foreach time in ft pt {;
			gen L_hc_`cat'_`time' = hc_t_`cat'_`time'*(hc_t/sum_hc_timeage)/12 if 
				(hc_t - sum_hc_timeage)/hc_t<=0.1;
			label var L_hc_`cat'_`time' "Employee head count `time' aged `cat'";
			drop hc_t_`cat'_`time';
		};
	};
	drop sum_hc_timeage;

	save templl_v3, replace;



	drop hc_m hc_f hc_t_ft hc_m_ft hc_f_ft hc_t_pt hc_m_pt hc_f_pt ws_m ws_f fte_m fte_f;

	
	**********************************;
	* merging in tenure share variables, part time and full time;

	merge 1:1 pent year using temp_pent_yr_ten_v3, keepus(hc_t_* fte_t_*) keep(master match) nogen;

	* setting all tenure vars equal to . if year==2001;

	unab ten: *ten*;
	foreach var in `ten' {;
		replace `var' = . if year<=2001;
	};

	* head counts and ftes of labour in each tenure category: scaling if <10% missing, else dropping;

	gen sum_hc_ten = 0;
	gen sum_fte_ten = 0;


	foreach cat in ten0 ten1 ten2 {;
		replace sum_hc_ten = sum_hc_ten + hc_t_`cat';
		replace sum_fte_ten = sum_fte_ten + fte_t_`cat';
	};
	foreach cat in ten0 ten1 ten2 {;
		gen L_hc_`cat' = hc_t_`cat'*(hc_t/sum_hc_ten)/12 if (hc_t - sum_hc_ten)/hc_t<=0.1;
		label var L_hc_`cat' "Employee head count with tenure `cat'";
		gen av_fte_`cat' = fte_t_`cat'*(fte_t/sum_fte_ten)/L_hc_`cat' if (fte_t - sum_fte_ten)/fte_t<0.1;
		replace av_fte_`cat' = 1 if L_hc_`cat'==0;
		label var av_fte_`cat' "Average FTEs of workers with tenure `cat'";
		drop hc_t_`cat' fte_t_`cat';
	};
	drop sum_hc_ten sum_fte_ten;


	* head cts & ftes of labour in each tenure category X part/full time: scaling if <10% missing, else dropping;

	gen sum_hc_timeten = 0;

	foreach cat in ten0 ten1 ten2 {;
		replace sum_hc_timeten = sum_hc_timeten + hc_t_`cat'_ft + hc_t_`cat'_pt;
	}; 
	foreach cat in ten0 ten1 ten2 {;
		foreach time in ft pt {;
			gen L_hc_`cat'_`time' = hc_t_`cat'_`time'*(hc_t/sum_hc_timeten)/12 if 
				(hc_t - sum_hc_timeten)/hc_t<=0.1;
			label var L_hc_`cat'_`time' "Employee head count `time' with tenure `cat'";
			drop hc_t_`cat'_`time';
		};
	};
	drop sum_hc_timeten;




	* working proprietor counts;

	gen WP_t = hcy_wagedWP_t + hcy_unwgdWP_t;
	drop hc_wagedWP_t - ws_wagedWP_f;
	label var WP_t "Working proprietor head count";
	
	gen WP_cat = 0 if WP_t==0;
	replace WP_cat = 1 if WP_t>0 & WP_t<=1;
	replace WP_cat = 2 if WP_t>1 & WP_t<=5;
	replace WP_cat = 3 if WP_t>5 & WP_t<=10;
	replace WP_cat = 4 if WP_cat==. & WP_t<.;

	label define WP_cat 0 "0" 1 "1" 2 "2-5" 3 "6-10" 4 ">10";
	label val WP_cat WP_cat;
	label var WP_cat "Working proprietor headcount (categories)";


	* merging in CPI to make WB real;
	
	merge m:1 year using CPI, keepus(deflator) nogen keep(master match);

	gen lnWB = ln(ws_t*deflator);
	label var lnWB "Total wage bill (ln real)";
	notes lnWB: In real 2006 $;
	drop hc_t ws_t;



	* merging in multiplant data and productivity data;

	merge 1:1 pent year using temp_RC_v3, keep(master match) nogen;
	merge 1:1 pent year using GWgap_readin_v3b_prody, keep(master match); 
	gen in_prody = 1 if _m==3;
	label var in_prody "In productivity data";

	gen ind2c = substr(anz06_4d,1,3);
	egen ind2 = group(ind2c);
	label var ind2 "2-digit ANZSIC06 industry";
	drop ind2c;

	egen pf_ind_gp = group(pf_ind);
	label var pf_ind_gp "Productivity industry group code";
	gen rand = runiform();
	label var rand "Stable random number";

	xi i.year, pre(Y);
	xi i.pf_ind_gp, pre(IN);
	xi i.WP_cat, pre(WPc_);


	label var months_empl "Number of months pent employed any non-WP";



	compress;
	notes: GWgap_readin_v3b_firm`headct'.dta was created by `outfile'.do in $S_DATE $S_TIME. It is a firm-year
	level data set for use in pf/wb estimation, with a subset of breakdowns by age group and tenure.;
	notes:  GWgap_readin_v3b_firm`headct'.dta contains data on firms with `headctl' head count whether or not in prody data.;
	save GWgap_readin_v3b_firm`headct', replace;

};





*6c***************************************;
* Making separate files for additional labour breakdowns: age;

use temp_pent_yr_age_v3, clear;

* merging on total head count and fte to compare the sum against;

merge 1:1 pent year using temp_pent_yrA_v3, keepus(hc_t fte_t) nogen;


drop ws_* hc_t_* fte_t_*;


* merging age categories 55 to 64 and 65+;

foreach g in m f {;
	foreach ty in hc fte {;
		gen `ty'_`g'_55p = `ty'_`g'_55to64 + `ty'_`g'_65p;
		
	};
	gen hc_`g'_55p_pt = hc_`g'_55to64_pt + hc_`g'_65p_pt;
	gen hc_`g'_55p_ft = hc_`g'_55to64_ft + hc_`g'_65p_ft;
};
drop *55to64* *65p*;



* head counts and ftes of labour in each genderXage category: scaling if <10% missing, else dropping;

gen sum_hc_gage = 0;
gen sum_fte_gage = 0;

foreach cat in lt25 25to39 40to54 55p {;
	replace sum_hc_gage = sum_hc_gage + hc_m_`cat' + hc_f_`cat';
	replace sum_fte_gage = sum_fte_gage + fte_m_`cat' + fte_f_`cat';
};
foreach cat in lt25 25to39 40to54 55p {;
	foreach g in m f {;
	
		if "`g'"=="m" {;
			local gender "Male";
			local gender2 "males";
		};
		if "`g'"=="f" {;
			local gender "Female";
			local gender2 "females";
		};
		
		gen L_hc_`g'_`cat' = hc_`g'_`cat'*(hc_t/sum_hc_gage)/12 if (hc_t - sum_hc_gage)/hc_t<=0.1;
		label var L_hc_`g'_`cat' "`gender' employee head count aged `cat'";
		gen av_fte_`g'_`cat' = fte_`g'_`cat'*(fte_t/sum_fte_gage)/L_hc_`g'_`cat' 
			if (fte_t - sum_fte_gage)/fte_t<0.1;

		replace av_fte_`g'_`cat' = 1 if L_hc_`g'_`cat'==0;
		label var av_fte_`g'_`cat' "Average FTEs of `gender2' aged `cat'";
		drop hc_`g'_`cat' fte_`g'_`cat';
	};
};
drop sum_hc_gage sum_fte_gage;


* head cts & ftes of labour in each gender X age category X part/full time: scaling 
* if <10% missing, else dropping;

gen sum_hc_gtimeage = 0;

foreach cat in lt25 25to39 40to54 55p {;
	replace sum_hc_gtimeage = sum_hc_gtimeage + hc_m_`cat'_ft + hc_m_`cat'_pt
		+ hc_f_`cat'_ft + hc_f_`cat'_pt;
}; 
foreach cat in lt25 25to39 40to54 55p {;
	foreach time in ft pt {;
		foreach g in m f {;
		
			if "`g'"=="m" local gender "Male";
			if "`g'"=="f" local gender "Female";

			gen L_hc_`g'_`cat'_`time' = hc_`g'_`cat'_`time'*(hc_t/sum_hc_gtimeage)/12 if 
				(hc_t - sum_hc_gtimeage)/hc_t<=0.1;
			label var L_hc_`g'_`cat'_`time' "`gender' employee head count `time' aged `cat'";
			drop hc_`g'_`cat'_`time';
		};
	};
};
sum hc_t;
keep if hc_t>=60;
drop sum_hc_gtimeage hc_t fte_t;

notes: Head count and FTE measures broken down by genderXage and genderXageXptft have been
adjusted to account for missing;

compress;
notes: GWgap_readin_v3b_firm_age.dta was created by `outfile'.do in $S_DATE $S_TIME. It is a firm-year
level data set for merging with GWgap_readin_v3b_firm.dta to then be used in pf/wb estimation. It contains
additional breakdowns by age interacted with gender and part/full time.;

notes: GWgap_readin_v3b_firm_age.dta contains data on firms with 5+ head count whether or not in prody data.;
save GWgap_readin_v3b_firm_age, replace;






*6d***************************************;
* Making separate files for additional labour breakdowns: tenure;

use temp_pent_yr_ten_v3, clear;

* merging on total head count and fte to compare the sum against;

merge 1:1 pent year using temp_pent_yrA_v3, keepus(hc_t fte_t) nogen;
#delimit ;
drop ws_* hc_t_* fte_t_*;

* tenure variables can't be defined in 2001;
drop if year==2001;



* head counts and ftes of labour in each genderXtenure category: scaling if <10% missing, else dropping;

gen sum_hc_gten = 0;
gen sum_fte_gten = 0;

foreach cat in ten0 ten1 ten2 {;
	replace sum_hc_gten = sum_hc_gten + hc_m_`cat' + hc_f_`cat';
	replace sum_fte_gten = sum_fte_gten + fte_m_`cat' + fte_f_`cat';
};
foreach cat in ten0 ten1 ten2 {;
	foreach g in m f {;
	
		if "`g'"=="m" {;
			local gender "Male";
			local gender2 "males";
		};
		if "`g'"=="f" {;
			local gender "Female";
			local gender2 "females";
		};
		
		gen L_hc_`g'_`cat' = hc_`g'_`cat'*(hc_t/sum_hc_gten)/12 if (hc_t - sum_hc_gten)/hc_t<=0.1;
		label var L_hc_`g'_`cat' "`gender' employee head count with tenure `cat'";
		gen av_fte_`g'_`cat' = fte_`g'_`cat'*(fte_t/sum_fte_gten)/L_hc_`g'_`cat' 
			if (fte_t - sum_fte_gten)/fte_t<0.1;

		replace av_fte_`g'_`cat' = 1 if L_hc_`g'_`cat'==0;
		label var av_fte_`g'_`cat' "Average FTEs of `gender2' with tenure `cat'";
		drop hc_`g'_`cat' fte_`g'_`cat';
	};
};
drop sum_hc_gten sum_fte_gten;


* head cts & ftes of labour in each gender X tenure category X part/full time: scaling 
* if <10% missing, else dropping;

gen sum_hc_gtimeten = 0;

foreach cat in ten0 ten1 ten2 {;
	replace sum_hc_gtimeten = sum_hc_gtimeten + hc_m_`cat'_ft + hc_m_`cat'_pt
		+ hc_f_`cat'_ft + hc_f_`cat'_pt;
}; 
foreach cat in ten0 ten1 ten2 {;
	foreach time in ft pt {;
		foreach g in m f {;
		
			if "`g'"=="m" local gender "Male";
			if "`g'"=="f" local gender "Female";

			gen L_hc_`g'_`cat'_`time' = hc_`g'_`cat'_`time'*(hc_t/sum_hc_gtimeten)/12 if 
				(hc_t - sum_hc_gtimeten)/hc_t<=0.1;
			label var L_hc_`g'_`cat'_`time' "`gender' employee head count `time' with tenure `cat'";
			drop hc_`g'_`cat'_`time';
		};
	};
};
sum hc_t;
keep if hc_t>=60;
drop sum_hc_gtimeten hc_t fte_t;

notes: Head count and FTE measures broken down by genderXtenure and genderXtenureXptft have been
adjusted to account for missing;

compress;
notes: GWgap_readin_v3b_firm_ten.dta was created by `outfile'.do in $S_DATE $S_TIME. It is a firm-year
level data set for merging with `outfile'_firm.dta to then be used in pf/wb estimation. It contains
additional breakdowns by tenure interacted with gender and part/full time.;

notes: GWgap_readin_v3b_firm_ten.dta contains data on firms with 5+ head count whether or not in prody data.;
save GWgap_readin_v3b_firm_ten, replace;






